In [1]:
import os
import Bio
import re
import timeit
import copy
from Bio import SeqIO
from Bio.Blast import NCBIXML
from Bio import Restriction
from Bio.Restriction import *
from Bio.Alphabet.IUPAC import IUPACAmbiguousDNA
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio import SeqFeature
from Bio.SeqFeature import *
import pandas
In [2]:
# Load in the Sanger sequencing data.
lib5pr = []
for filename in os.listdir("abifiles/5prlib"):
    handle = open("abifiles/5prlib" + "/" + filename, 'rb')
    record = SeqIO.read(handle, "abi", alphabet=IUPACAmbiguousDNA())
    lib5pr.append(record)
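As an optional sanity check (an illustrative addition, not part of the original analysis), ABI records parsed by SeqIO carry per-base Phred scores in letter_annotations, which can flag short or low-quality traces before the hairpin search below; the cutoff is arbitrary.

# Optional quality check (sketch): report traces whose minimum Phred score is low.
for record in lib5pr:
    quals = record.letter_annotations.get("phred_quality", [])
    if quals and min(quals) < 10:  # illustrative cutoff, not from the original analysis
        print("low-quality trace: %s (len %d, min Q %d)" % (record.name, len(record), min(quals)))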
In [3]:
# Define the filter that delineates the end of a target 20mer, using the first 9 nt of the sgRNA hairpin.
# Results in 45 targets for the 5' library.
sgRNAconst = SeqRecord(Seq("GTTTAAGAG"))
sgRNAfiltlib5pr = copy.deepcopy(lib5pr)
# Make a Feature for each of the marker 9 nt sgRNA signatures in the Sanger sequencing files.
for seqrecord in sgRNAfiltlib5pr:
    fwdlocs = []
    revlocs = []
    fwdlocs = [tloc.start() for tloc in re.finditer(str(sgRNAconst.seq), str(seqrecord.seq))]
    for item in fwdlocs:
        start = ExactPosition(int(item))
        end = ExactPosition(int((item) + len(sgRNAconst) + 1))
        location = FeatureLocation(start, end)
        feature = SeqFeature(location, type="sgRNAconst", strand=+1)
        seqrecord.features.append(feature)
    revlocs = [tloc.start() for tloc in re.finditer(str(sgRNAconst.reverse_complement().seq), str(seqrecord.seq))]
    for item in revlocs:
        start = ExactPosition(int(item) - 1)
        end = ExactPosition(start + len(sgRNAconst))
        location = FeatureLocation(start, end)
        feature = SeqFeature(location, type="sgRNAconst", strand=-1)
        seqrecord.features.append(feature)
# Make a new list of SeqRecords from the 36 nt window immediately 5' of each hairpin signature
# (in the guide's orientation); these windows are trimmed down at the T7 promoter below.
alltgts = []
for seqrecord in sgRNAfiltlib5pr:
    for feat in seqrecord.features:
        if feat.strand == 1:
            tgtstart = int(feat.location.start) - 36  # -21
            tgtend = int(feat.location.start)
            sgtgt = seqrecord[tgtstart:tgtend]
            alltgts.append(sgtgt)
            #print "pos \n \n"
        if feat.strand == -1:
            tgtend = int(feat.location.end) + 36  # +21
            tgtstart = int(feat.location.end)
            sgtgt = seqrecord[tgtstart:tgtend].reverse_complement()
            sgtgt.name = seqrecord.name
            alltgts.append(sgtgt)
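As a quick sanity check of the slicing offsets (an illustrative addition, not part of the original run), a synthetic read with a known 20mer placed directly upstream of the hairpin signature should return that 20mer from the window taken above:

# Toy example: the window upstream of the hairpin signature should end with the 20 nt
# protospacer (the extra bases in real reads are vector/T7 sequence, removed later).
toy_target = "ACGTACGTACGTACGTACGT"                      # hypothetical 20mer
toy_read = "T" * 16 + toy_target + str(sgRNAconst.seq) + "AAAA"
hit = re.search(str(sgRNAconst.seq), toy_read).start()
assert toy_read[hit - 20:hit] == toy_target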
In [4]:
# Can these be accurately trimmed to right after the oligo-derived T7 promoter sequence (gaaatTAATACGACTCACTATAG)?
# The 3' end of the promoter (ACTCACTATAG) is used as the trim anchor in the next cell.
In [5]:
# Trim each target to the sequence immediately after the T7 promoter, then check the resulting lengths.
a = []
for item in alltgts:
    try:
        l = [tloc.end() for tloc in re.finditer("ACTCACTATAG", str(item.seq))]
        #print l
        item = item[int(l[0]):]
        #print item.seq
        a.append(item)
    except IndexError:  # no T7 promoter match found in this target
        l = 0
alltgts = copy.deepcopy(a)
In [6]:
for item in a:
    print item.seq
In [7]:
for item in a:
    print len(item)
In [8]:
b = [len(i) for i in a]
b.sort()
[item for item in enumerate(b)]
Out[8]:
In [9]:
# From the trimmed target SeqRecords, search against the amplicon set (amps144 plus the extra amps) and get hits:
allamps = []
#for item in SeqIO.parse("amps144masked_iter0.fasta", "fasta"):
#    allamps.append(item)
for item in SeqIO.parse("amps144.fasta", "fasta"):
    allamps.append(item)
for item in SeqIO.parse("theextraamps.fasta", "fasta"):
    allamps.append(item)
Next: re-generate the list of cut guides that was BLASTed before. Are the hits (within the amps300 amplicon set) actually within the expected PAM-adjacent cut sites?
This is somewhat of a reboot: starting way back from the post-T7 target sequences in the sequencing data, BLAST against a newly made amps144 database. The goal is to get the match locations and visualize them.
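For completeness, a minimal sketch of how the "amps144" BLAST database could be built from the combined amplicon fasta (run outside this notebook; assumes the NCBI BLAST+ makeblastdb binary is on the PATH, and the combined filename is illustrative):

# Sketch only: write the combined amplicons and build a nucleotide BLAST db named "amps144".
import subprocess
SeqIO.write(allamps, "allamps_combined.fasta", "fasta")  # illustrative filename
subprocess.check_call(["makeblastdb", "-in", "allamps_combined.fasta",
                       "-dbtype", "nucl", "-out", "amps144"])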
In [10]:
Bio.SeqIO.write(alltgts, "alltgtstemp.fa", "fasta")
blastn_cline = NcbiblastnCommandline(query="alltgtstemp.fa", db="amps144", \
task = "blastn-short",outfmt=5, out="alltgts.blast", max_target_seqs=100, num_threads = 7, evalue = 0.005)
timeit.timeit(blastn_cline, number =1)
Out[10]:
In [11]:
result_handle = open("alltgts.blast")
blast_records = NCBIXML.parse(result_handle) # use NCBIXML.parse(result_handle) for multiple queries here
blast_records_list = []
for blast_record in blast_records:
    blast_records_list.append(blast_record)
result_handle.close()
In [12]:
blastsandrecords = []
for i, j in enumerate(alltgts):
    blastsandrecords.append((j, blast_records_list[i]))
In [13]:
i = 0
for item in blastsandrecords:
    print(item[0].name + " " + item[0].seq)  # Print out the query seq and its title, basically
    for alignment in [item[1]]:
        for aln in alignment.alignments:
            print aln.title  # Print out each amp of amps144 + extra amps that a match is made on
            i = i + 1
            for hit in aln.hsps:
                print("--" + str(hit))  # Within each hit amp, print out each specific hit sequence
    print "\n"
print i
In [14]:
#blastsandrecords[0][1].alignments[0].hsps[0].sbjct_start
#blastsandrecords[0][1].alignments[0].hsps[0].sbjct_end
#blastsandrecords[3][1].alignments[0].title.split()[1] # splits the generated BLAST alignment title on \
# whitespace; extracts the second element which \
# corresponds to the position on the scaffold
In [15]:
mappable = []
for tgt, blast in blastsandrecords:
    for i, j in enumerate(blast.alignments):
        try:
            pcr = blast.alignments[i].title.split()[1]
            start = blast.alignments[i].hsps[0].sbjct_start
            end = blast.alignments[i].hsps[0].sbjct_end
            match = blast.alignments[i].hsps[0].match
            query = blast.alignments[i].hsps[0].query
            print blast.alignments[i].title
            print pcr
            print i
            print(" " * (blast.alignments[i].hsps[0].query_start - 1) + query)
            print(" " * (blast.alignments[i].hsps[0].query_start - 1) + match)
            print tgt.seq + "\n"
            print
            mapstring = (pcr, start, end)
            mappable.append((tgt, mapstring, match))
        except IndexError:  # alignment title or HSPs missing the expected fields; skip it
            pcr, start, end = 0, 0, 0
In [16]:
len(mappable)
Out[16]:
In [17]:
ampsdict = {}
for item in allamps:
    ampsdict[item.id] = item.seq
Problem: not all BLAST hit titles (sequences) map to entries in the amplicon dictionary. This was solved by changing which amplicon list is loaded, although the individual amps are then not the short (masked) versions you would ideally want, so this should perhaps be revisited.
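A quick way to see which hit identifiers fail to map (a diagnostic sketch, not part of the original run):

# Diagnostic sketch: collect BLAST hit ids from `mappable` that are absent from ampsdict.
missing = set()
for tgt, (pcr, start, end), match in mappable:
    if pcr not in ampsdict:
        missing.add(pcr)
print("unmapped hit ids: %s" % sorted(missing))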
In [18]:
def printloc(queryseq, ampsdict):  # lib5pr is subjectseq; t7 is queryseq
    '''
    Accepts a query tuple (an entry from `mappable`) and a dictionary of subject seqs keyed by amp id,
    annotates the amp with the BLAST hit location, and draws a text map highlighting it.
    Returns a tuple: (map string, context SeqRecord, annotated amp SeqRecord, fwdlocs, start, end, feature).
    '''
    subjectseq = SeqRecord(ampsdict[queryseq[1][0]])
    #for seqrecord in subjectseq:
    locstart = queryseq[1][1]
    #print queryseq
    locend = queryseq[1][2]
    fwdlocs = []
    revlocs = []
    # Figure out which strand the BLAST hit is on
    if locstart <= locend:
        fwdlocs.append(locstart)
    if locstart > locend:
        revlocs.append(locend)
    for item in fwdlocs:
        start = ExactPosition(int(item))
        end = ExactPosition(int((item) + len(queryseq[0].seq) + 1))
        location = FeatureLocation(start, end)
        feature = SeqFeature(location, type=str("cutsite_fwd"), strand=+1)
        subjectseq.features.append(feature)
    for item in revlocs:
        start = ExactPosition(int(item) - 2)
        end = ExactPosition(start + len(queryseq[0].seq) - 1)
        location = FeatureLocation(start, end)
        feature = SeqFeature(location, type=str("cutsite_rev"), strand=-1)
        subjectseq.features.append(feature)
    #print subjectseq.features
    mask = list((("-" * 9) + "^") * int(round(len(subjectseq.seq) / 10.0)))
    for feature in subjectseq.features:
        featstart = int(feature.location.start)
        featend = int(feature.location.end)
        if feature.strand == 1:
            mask = mask[:featstart] + [">"] * int(featend - 1 - featstart) + mask[featend - 1:]
            #context = subjectseq[featstart+1:featend+4]
            context = subjectseq[featstart - 10:featend + 10]
        if feature.strand == -1:
            mask = mask[:featstart + 1] + ["<"] * int(featend + 1 - featstart) + mask[featend + 1:]
            #context = subjectseq[featstart-2:featend+2]
            context = subjectseq[featstart - 10:featend + 10]
    mask = "".join(mask)
    # Add labels
    masklab = list(" " * (len(subjectseq.seq)))
    for feature in subjectseq.features:
        featstart = int(feature.location.start)
        featend = int(feature.location.end)
        featname = str(feature.type)
        masklab = masklab[:featstart] + list(str(featname)) + list(" " * (featend - 1 - featstart - len(featname))) + masklab[featend - 1:]
    masklab = "".join(masklab)
    #print subjectseq.name
    lines = int(round(len(subjectseq.seq) / 100))
    i = 0
    fullstring = []
    # Draw out the map, with three lines: subject seq, a marker/counter line with chevrons over features, then a
    # feature label
    while i <= lines:
        indexstart = i * 100
        indexend = (i + 1) * 100
        if indexend > len(subjectseq.seq):
            indexend = len(subjectseq.seq)
        outstring = list(str(indexstart + 1) + " " + subjectseq.seq[indexstart:indexend] + " " + str(indexend) + "\n" +
                         str(indexstart + 1) + " " + mask[indexstart:indexend] + " " + str(indexend) + "\n" +
                         str(indexstart + 1) + " " + masklab[indexstart:indexend] + " " + str(indexend) + "\n")
        i = i + 1
        fullstring.extend(outstring)
    fullstring = "".join(fullstring)
    return (fullstring, context, subjectseq, fwdlocs, start, end, feature)
In [19]:
#t = printloc(mappable[3], ampsdict[mappable[3][1][0]])
t = printloc(mappable[6], ampsdict)
In [20]:
t[1]
Out[20]:
In [21]:
len(mappable)
Out[21]:
Next up: trim the sequence files properly based on being followed by the hairpin start, and include a map of the expected cut sites in that amplicon.
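For the expected cut-site map, a minimal helper could convert a protospacer match on an amplicon into the expected cut coordinate (an illustrative sketch; assumes canonical SpCas9 blunt cleavage 3 bp from the PAM):

def expected_cut_position(proto_start, proto_end, strand):
    """Sketch: return the amplicon coordinate of the expected SpCas9 cut,
    3 bp inside the PAM-proximal end of the protospacer (blunt cut assumed)."""
    if strand == 1:
        return proto_end - 3    # PAM lies immediately 3' of proto_end
    else:
        return proto_start + 3  # PAM lies immediately 5' of proto_start on the top strand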
In [22]:
mappable[1]
Out[22]:
In [23]:
for number, item in enumerate(mappable):
    if "N" in item[0]:
        print number
        print item
In [24]:
mappable[7]
Out[24]:
In [25]:
t = printloc(mappable[10], ampsdict)
#t = printloc(mappable[10], ampsdict["9981706"])
In [26]:
print t[1].seq
In [27]:
mappable[-1]
Out[27]:
In [28]:
primers_fwd = '''\
Fwd
TCCCTTTCTTTCCCGTTACC
AGAATAGGGACCGCATTGAC
GAGAGTCGGCGAGTCCATAA
AGAGCTGAAGCACCACAGGT
GCCGGATTCACTCAGATCA
ACCAGCCAACCAGACCATTA
GCCCGCTCCATTATAACAAG
TTCGTGCAATGGACAAGTAGA
GTTGACCCAAATGACCCATC
TGGTGAAGTGTGACGTAGCC
GGAATTGACATGGACAATGG
GGCTAAACCGGAATGAACAA
TGGGTCCCAATGATGAGTCT
CAGAGCTGCCTCATGACACT
GCAATGTGATGCGAAGTGTA
TAATCAGAGACTCGCCAACG
AAAGGATTGTGGACTCATGG
TGGACGCAGTTGTCATGATT
CGAAGAGGGATGCAGGACTA
AAGCGCCCTTCTTTCCTAAT
TAACAAGCCATTTGCCACCT
TTTCTGGAAGTTTGGACACTG
TGACTCAGAGGGTAGCAACT
ACCAAGAGTCCTCATGCACT
ACAATTCGCCAATCATTGCT
TCACACTTAACTGGGAGAATG
ATCATCCTGCGCCTAAGGTT
GCACTCTACACAAAGTTCTCG
CTGGGCCCAAAGTATCTCAT
TGTCACCCACTAATGTTTCAGG
CAAAGCTCACGTCAAATAAACG
GAACCAGAATGAGTGCTGTCC
AAAGACGGCCAGTATGCAGT
CAGTGTTCATCGGAACAAAGC
AGATCTTGGAGGCCCTGTTT
TGTGATTATGCAGAGGACAACC
TGCTCAATTACGGGTTTGGT
TTGGCCATACTTCAGCCAAT
CCGACCTGAACCCTCCTAAT
CTGTCTGTCTCTACCAATCACC
TGTGCTCTGTTGATGCGTCT
TCCTTCTCAGTATGCGCTGA
TGCAAGAGCGTCTGAATTTG
TATATTGCCTGGGCGCTAAC
TGTCACAACCCACTGATTCC
CCACTGATATAGTGTGGGCTAA
GTTACTGCCGTGAGGGATGA
AGTGATGGGTCTGCCAGAAT
AAACATGGTAAGCATCTGTGG
CAGTTATGGCTGCCTCGAA
GGGATTAGGGAGGATCAGGA
GCCAGGAATTGGCAGTAGTC
GACACGGGAAAGAAACATGA
TTTCAGTAGCCGCATCAGTG
CCAATTAAGCAGATTGGAGTTC
CCTTGTAATCCTACTGTGCCTA
GGCTTGCTCTGAGAAGGCTAT
TGCTGGAGTCCACCTGATTA
GACTGAACCGTCATTCCGATA
CGCCCACCAACTGAACTTAG
AGTGTGACGTCAGAGGCAAG
TTGCATTATTATGCGCTACTGG
GTTGTGAAATCTATTGCCTCCA
AGACACAATCTAATGAGGGATG
TTGGATGAGGTTGAGGCTTA
AGACTCCTGAGAGCCCATTT
CGTGCGATTGTTTCAGGTTT
GGAACGGTGTGTATGTCCAA
CCAAACCTAGGTGGTTCTCG
GGAACACTCATTAGGGAGCA
TCTTTACAGCACCTGCTTCTGA
GAGCCGAATAAAGTGACAAA
TGTGAATCAATCTGTCTTACGC
TATGATTGAGGGCCTTGTGG
CCAGTTCCAGGTGTGCCTA
CAGTGCCCACAAGGAGTAGG
AGAATAGGTGGATTCACTGAGG
AGTTGGGCAGGCCTAACATT
CCAATGGGCAGGAACTTATG
TCATCAACAACTGGAGTCTGC
ACGATGCAGCAATTCCCTAC
GGTACTGCCATCACCCTTGT
GCAGTGTGAGCCCAACAGTA
AGCCTGGACCTCTCCTTGAT
CCCATAAGTGCCGACTTCA
CCAGAAAGTAGGAGCCGATG
TCCCGGCTCTAAAGTAGTCTTG
AAAGTCAAGGGCTGCCATC
GGGAGAGCCCTTGGAATAAA
AACGATGTACAACACCAGTTGC
GCCAAGGATGAAACCAAATC
AATGGATCAATACCCTGTCC
CCACATAGCTTCCCTGTTCTTT
TGGTCATACCACACCAATGAA
CCAAGCTAGGCTTGAACTGG
TAGCCGCTTCGCAGTTTAAT
CCACCCTTCAGACTGGCTAC
AGGAAGGACATGGAATTAACTG
AATGCCCTCAAGTAGCATGG
GTCTTGAGGAAGCAGCAACC
TTTGCCCGGTGATAGAATGT
TCATGAGTTGCATATGGATGG
GTTCATTGATGGGTGCCAGT
TGGTGAACCTGTATCAAATACG
TCAGGAATGCCTTACTTGAGA
CTTGCAGGAACTTATGAACACA
TGAATGGATCCACCACAGAA
ATCCCAAGGGAACACGTAAG
GCCCACAGATTGCATTCAC
CGGCCCTGTCTCACAGTAA
CCAGGGTATTCTAACCCTATGC
TGGAGAATCCCAAGGATGTT
AACGTGCAACCTTTGAGTCC
TCCTCCTAAAGAAACGACGTG
TCCAAGCACTCCAACCTTGT
TTTCTGATGGGCCTCTGG
TCCTCGTAAGAGGTGTTTCCA
CACCCAACTCTTATGGTGGAA
ACCCGCCTCAATACCAAAGT
TCAGAATGGCTATGGCTGTG
CTAGCGGTTTATGAGCGTCAC
CCAACTCACACTCCAATAATCA
TCCATTGTAGCCGTGCTGTA
GCATTGCAGTTCCAATCAGA
GGCTGGACAAATACCACTGC
TCGTCAGAAGTTGTCCAAGG
CCACTATGGCCAACAAGAGAG
CCTGTGGGAAGTTATGAGACG
CTATTTGACCCGCAGTTTCC
TGGTTGCTCACATCACTGAA
CTCTTTGCAGATGAGCGTGA
GGAAGCTACTGCCGTCGTTA
TCCTTTATTGTCCCGCCATA
CCACATGTGCCTTGGTAAGA
CAACAGCAATCACCCTTCAA
AGGAGGATTATTGCACCCATA
TGTGCATCACACACTCTGGA
CTGGGACCACAGGGATAAAG
GTCGCACACATAAACGCAGT
AACAGGATCGGAGAGCATTG
ACACTCATGATAGTGACCTGCT
GGGAACCGTAGAGTTTATTGTG
AGGTTCCACAAGGAGGGAGT
ACATTGGCCTTGATCCTGAG
'''
In [29]:
primers_rev = '''\
Rev
GCGCCTAAGTGTCTTTGCAT
CAAGTGCAGAGCACCTTGAC
TGCCAACGTTTGTCTCTGAC
ATCACATGTGTCTCCAGGAA
GGCAGTTGGGACGTATTTGT
GCCAGTACCTGCCAGTAACC
TTGCTGGCACATTACCACTC
TTGGCTCCTGGACTGTCTTC
GAAACAGCCGTGTCCAGAT
TTCTGCAACGAACTGTCTCTG
GTTTCGGACCCACAATGG
TGTCACTAAAGCCTAGCAGAAA
CCCAGGAGATGGTCATAATC
GCCCTGAGTATCGGCATACA
TTTGTCAGCTTGTGGACCTG
AGGAATCCTATGCTATTTCTCG
CCCATGGTCCTTACAGACTGA
GAGCGTAGCACCACTTACGG
AACTGGTATGAATGCGCAAC
CCTGCTATCTCATCTTCCTTCA
CGAAATCCGGAAATCTCTGT
TCGGTCAGAATCACATCTGC
GCACTGGGATCTCAGGTTTG
CATTCTTCACGCTTGTTCCA
GCTGAAAGATACCTGCCAACA
GGGCAGGCTCTCTTAGTCAA
GCTGCTTGAATAATTCGTCTGC
TGACCGGAAATGTTGGAAAC
CTTGCACAAGTTGCTTCACA
GCTGTACCCGTGTAGGCTTT
GGCAAAGGGCTCCAGATATAA
CCCAGGAATAGAAGTCACGTTT
CTAATTGAATGCGTTCATGC
TAGGATGCTGCCCTATGGTC
GTCACCGACCATTCATTTCA
TTTCATCCCTTGTCATCCAT
GGGCTCTGGTCAAATGAT
CTTCTGCTCATGGGTTTGGT
CCAGTCTAGTGGCCAGGATT
AATTGAAGAGGAGACCCTGCT
AGGGCATCTCCAATGGTGTA
CTGCTGTACATCCAGGCTGA
CCATTACGGATGTAGTTCAGCA
GCAGTCGAGGCTTTGAGTCT
GCAAACCTTCAGGAGCATGT
AAAGACCCAGCAGGAATTGA
GGGTGTCTTAGAGGGTAACAAA
CACTCCATACAAAGCGCTCA
AATGGAAATCGCCACTATACG
ACAGACCCGCCCTGATGAT
CATTGACATGACACATTTCTCG
AGCTCTGTCCCAGGGTATAGT
AAGCCGTAAAGTGGAAGCAG
TGAGCTAACATTCTCAAGTCCA
TAATTGGGCTGTCCTTCACC
CCAGCTCAAGTTCGAGGAAA
ACATTCGCCGTAAAGCAAAG
AAATCCATTGGGCCTGCT
AAGGGACCATCTGGGTATGT
TGACCTGTACAACACCTTGTGA
TGTGCTACTGCCATGTACCC
CCATCTTAGGCCAACTTCCA
ACCCATCCTGGCACACTGTA
ACCCAAGGGTCTCACACTTC
GGGATTGAGTCAGGTGGGTTA
ATCCAAGTCCTGCCTGAGGT
TTTCATGTGAGGTTGCCAAT
AACACTTGTGTATCGGCCATC
TGAGCAACTTATTGAGGCACA
AGGGTTAGACGACTGCCAAG
TGCATTTAGACGTTTGGTTG
TCGAACTATCATCCCGCAGT
TCACCTGATGCCTTGGTAAA
CCCTCCAAATGAAGTGACCT
CCACCCAGGATCTATTTAGAGG
TTCGGCATCGCTTATTTACG
CTTTACGGATTGGGCAAGAA
TGACCCACTCAGCATAATGAA
TTTCTGGCAAGCACTCAGAA
AAACAAGGACATGCCACACA
TTCAATCCAAACGATGCAGA
TGCACCAGTCTATTCGGTCA
CCTGCATGCCTAGGGTATATT
CCCTGTGGTTGTCTAGCGTA
CCTAAGGCGCAATAGTGTGG
CTATCCAGAACCTCCCAGCA
AGAATACCACTGCTTGCTGAGA
CCTCTGCTGGCTACAGTTTG
CCATAAACCTTGGACGCAAC
TTTGAGTTGCCTGAACGTGA
GGTCTTCTTGGCCTTCCTAAA
GGTAGATACCCGTGGAATGC
GGGAGGGTATCCACATGAGA
GGAAGTGTAAGCTAAGGCTCA
ATTTCACGGCAAGCCAATTA
TTTGTCGCGCATCACTTT
CATTCCCTAAGGCATTTGTTTC
CCCTTAGAGGACAACGGAGA
TCGAGCATGGTCTGCATTAG
ACCTCTGTTGGTCCCTATGC
GGATTACAGTGGCCATATCGTT
TTAAGGAGCTGATGATTCCAG
CTACATGCCTTGGGCTTAGG
CTCAGGGTTCCTGTGCTCTC
CCCTCTTAGGGTATACGGGTTA
AATTTGGGTCGTGCGTATGT
ACTCCACTGAGGCCCAGATA
TTGGAAGGGCCATGTATAGG
AAGGGACGGTTTAGGGTCAG
CACGTGAGCTTCGGATGTTA
TTGAATGCATAGCACCTTTG
TGGCCTTGATCCTTCAGTTC
ATTTCAAATGCCCAAACGAC
TAACACCATGGCCGAGATTT
AAGTTCTCCAGGCGAATCAG
AGGTGTTTACCGAAGGCAGA
CGAGCTGTTGGTCATTGCTA
TCCAACCATTCCAAAGTCAA
CATCAGCACAAGCAGTCGTT
TGGTCTGATGTGACGAAAGC
TCTATCCATGGAGTCATTTGG
CTGGATGGCCAACTTCTGTC
GGAGACAAGACCGTGACACA
AACCTTGGCCAGGTATTATG
CCACATTTGTAAACGGCTCA
CCCATATTTGCGACATGTGTT
CCCATCAGCATACCACCTTC
CCCAGATTCCTGCCCATT
GATTTCGGGTGCATTGTCTT
GGACAACAGCTATGGCTTGC
TGTGTGTTATGGCGATGTCC
TTGCCTATAATTGAGCCAGAGA
AAACCATAGATCCTGGTTCAG
TTCAGATACTTCATCCTCAACC
ATGTATATTCACGCCTGTGG
GCTGCACAGAGATTCGATGA
CATGTCCAGGCAGTCCAAT
GGCAGGGTCCATCTACAGTT
CCCTCTCTCGGCTCCTATCT
TGCTCTTCAAAGTGCCTCCT
GGCCATCTGAGACTTTGCAC
CTGCAGCAATGGCCTTAAAT
AACTTGAGCGCAGGGAACT
AGATCTGGCTCCCATGCAC
'''
In [30]:
import io
primers_fwd = pandas.read_table(io.BytesIO(primers_fwd))
primers_rev = pandas.read_table(io.BytesIO(primers_rev))
In [31]:
fwdprimerlist = []
for index, item in [list(x) for x in primers_fwd.itertuples()]:
    fwdprimerlist.append(item)
revprimerlist = []
for index, item in [list(x) for x in primers_rev.itertuples()]:
    revprimerlist.append(item)
In [32]:
revprimersreversed = []
for item in revprimerlist:
    item = Seq(item, IUPACAmbiguousDNA()).reverse_complement()
    revprimersreversed.append(item)
revprimerslist = revprimersreversed  # note: typo'd name (extra "s"); revprimerlist itself is re-assigned in the next cell
In [33]:
revprimerlist = copy.deepcopy(revprimersreversed)
In [34]:
len(revprimersreversed)
Out[34]:
In [35]:
f = []
for item in fwdprimerlist:
    item = Seq(item, IUPACAmbiguousDNA())
    f.append(item)
fwdprimerlist = copy.deepcopy(f)
In [36]:
fwdprimerlist
Out[36]:
In [37]:
ampsdict["10206893"]
Out[37]:
In [38]:
fwdprimerlist.extend(revprimerlist)
In [39]:
f = []
for index, item in enumerate(fwdprimerlist):
    item = SeqRecord(item)
    item.id = str(index)
    item.description = str(index)
    item.name = str(index)
    f.append(item)
In [40]:
with open("288primers.fasta", "w") as handle:
SeqIO.write(f, handle, "fasta")
Made a BLAST database ("288prim") from the 288 primers.
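The database build itself happened outside the notebook; a minimal sketch of the equivalent command (assumes the BLAST+ makeblastdb binary is on the PATH):

# Sketch only: build the "288prim" nucleotide BLAST database from the primer fasta written above.
import subprocess
subprocess.check_call(["makeblastdb", "-in", "288primers.fasta",
                       "-dbtype", "nucl", "-out", "288prim"])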
In [41]:
# To match against the primer BLAST database, which of the mappable 20mers are basically fragment ends?
# First, write out a query fasta containing the mappable targets:
m = [j for j,k,l in copy.deepcopy(mappable)]
In [42]:
for item in m:
    item.id = item.name
In [43]:
Bio.SeqIO.write(m, "mappablesequencedtgts.fasta", "fasta")
Out[43]:
In [44]:
blastn_cline = NcbiblastnCommandline(query="mappablesequencedtgts.fasta", db="288prim", \
task = "blastn-short",outfmt=5, out="288prim.blast", max_target_seqs=100, num_threads = 7, evalue = 0.01)
timeit.timeit(blastn_cline, number =1)
Out[44]:
In [45]:
result_handle = open("288prim.blast")
prim_blast_records = NCBIXML.parse(result_handle) # use NCBIXML.parse(result_handle) for multiple queries here
prim_blast_records_list = []
for blast_record in prim_blast_records:
    prim_blast_records_list.append(blast_record)
result_handle.close()
In [46]:
counter = 0
for item in prim_blast_records_list:
    for i, h in enumerate(item.alignments):
        print i
        for j in h.hsps:
            print j
            counter = counter + 1
print counter
In [47]:
# Generate a list of the mappable targets that didn't get matches against the primer library
mappable_noends = []
for item in prim_blast_records_list:
    if len(item.alignments) == 0:
        mappable_noends.append(item.query.split()[0])
So, 25 of the 42 mappable matches are the ends of PCR products. Next, filter the mappable list to remove these. Then check PAM-adjacency.
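As a reference for the PAM check done below, a minimal sketch (illustrative; follows BLAST's 1-based sbjct_start/sbjct_end convention, where start > end marks a minus-strand hit) of pulling the 3 nt adjacent to a protospacer match and testing it against NGG/NAG:

def pam_adjacent(ampseq, sbjct_start, sbjct_end):
    """Sketch: return the candidate PAM next to a BLAST hit and whether it is NGG/NAG.
    ampseq is the amplicon Seq; sbjct_start/sbjct_end are 1-based BLAST coordinates."""
    if sbjct_start <= sbjct_end:                       # hit on the plus strand
        pam = str(ampseq[sbjct_end:sbjct_end + 3])
    else:                                              # hit on the minus strand
        pam = str(ampseq[sbjct_end - 4:sbjct_end - 1].reverse_complement())
    return pam, pam[1:3] in ("GG", "AG")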
In [48]:
len(mappable_noends)
Out[48]:
In [49]:
len(mappable[0])
Out[49]:
In [50]:
d = []
for item, j, k in mappable:
    if item.name in mappable_noends:
        d.append((item, j, k))
In [51]:
mappable[0]
Out[51]:
In [52]:
pamlist = []
for item in d:
    t = printloc(item, ampsdict)
    # printloc returns (map string, context, annotated amp, fwdlocs, start, end, feature):
    # t[2] is the annotated amp SeqRecord; t[-3]/t[-2] are the hit feature's start/end on it.
    if t[1].features[0].strand == 1:
        pamlist.append((t, t[2][t[-2] - 1:t[-2] + 2]))
    if t[1].features[0].strand == -1:
        pamlist.append((t, t[2][t[-3] - 2:t[-3] + 1].reverse_complement()))
In [53]:
print(t[0])
In [54]:
print(t[1])
In [55]:
for idx, (tmap, pam) in enumerate(pamlist):
    print idx
    print pam
    print "\n"
In [56]:
print pamlist[14][0][0]
In this analysis, 6 items are not adjacent to an NAG/NGG PAM, but 3 of those are probably leftover fragment-end duplicates, leaving 14/17 correct (roughly an 82% yield).
1 is in a primer (989347).
6 (141439).
7 is not in a primer.
8 is not in a primer.
9 is not in a primer.
14 is in 4995367.
In [ ]: